This is a guided analysis to show how to interpret (CD8 T cell) single-cell RNA-seq data using Seurat, TILPred, AUCell and other R packages We use a public dataset of small size (~1K cells) in order to make fast calculations from Singer et al 2016 GEO GSE85947

Preparing the environment

Install required packages using packrat

install.packages("Packrat")
library(packrat)
packrat::on()
packrat::restore()

Import data

Load packages and source files (custom functions)

#install.packages("Seurat")
library(Seurat)
## Warning: replacing previous import 'mclust::dmvnorm' by 'mvtnorm::dmvnorm'
## when loading 'fpc'
source("functions.R")

Read expression matrix (\(log2 (TPM+1 )\))

if(!file.exists("input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz")){
download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE86028&format=file&file=GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz",destfile = "input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz")
}

m <- read.csv(gzfile("input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz"),row.names=1,sep=" ",header=T)
m[1:10,1:5]
##               AH3VMFBGXY_A10_S10 AH3VMFBGXY_A10_S298 AH3VMFBGXY_A11_S203
## 0610005C13Rik             0.0000               0.000              0.0000
## 0610007N19Rik             0.0000               0.000              0.0000
## 0610007P14Rik             0.0000               0.000              0.0000
## 0610008F07Rik             0.0000               0.000              0.0000
## 0610009B14Rik             0.0000               0.000              0.0000
## 0610009B22Rik             0.0000               0.000              0.0000
## 0610009D07Rik             5.0492               6.931              8.4522
## 0610009L18Rik             0.0000               0.000              0.0000
## 0610009O20Rik             0.0000               0.000              0.0000
## 0610010B08Rik             0.0000               0.000              0.0000
##               AH3VMFBGXY_A11_S299 AH3VMFBGXY_A12_S12
## 0610005C13Rik              0.0000                  0
## 0610007N19Rik              0.0000                  0
## 0610007P14Rik              0.0000                  0
## 0610008F07Rik              0.0000                  0
## 0610009B14Rik              0.0000                  0
## 0610009B22Rik              0.0000                  0
## 0610009D07Rik              6.0911                  0
## 0610009L18Rik              0.0000                  0
## 0610009O20Rik              0.0000                  0
## 0610010B08Rik              0.0000                  0

Read meta data

meta <- read.csv(gzfile("input/GSE86028_TILs_sc_wt_mtko.index.txt.gz"),sep="\t",as.is=F,row.names=1)
meta$mouse_id = factor(meta$mouse_id)
meta$plate_uniq_num = factor(meta$plate_uniq_num)
meta$batch_bio = factor(meta$batch_bio)

head(meta)
##                     cell_type condition_plate     seq_id batch_bio
## AH3VMFBGXY_G10_S370       CD8            MTKO AH3VMFBGXY         2
## AH3VMFBGXY_G7_S367        CD8            MTKO AH3VMFBGXY         2
## AH3VMFBGXY_G6_S366        CD8            MTKO AH3VMFBGXY         2
## AH3VMFBGXY_D2_S326        CD8            MTKO AH3VMFBGXY         2
## AH3VMFBGXY_B10_S22        CD8              WT AH3VMFBGXY         2
## HYGHVBGXX_D12_S144        CD8              WT  HYGHVBGXX         2
##                     mouse_id plate_uniq_num
## AH3VMFBGXY_G10_S370        7              1
## AH3VMFBGXY_G7_S367         7              1
## AH3VMFBGXY_G6_S366         7              1
## AH3VMFBGXY_D2_S326         7              1
## AH3VMFBGXY_B10_S22         3              2
## HYGHVBGXX_D12_S144         3              3
str(meta)
## 'data.frame':    1061 obs. of  6 variables:
##  $ cell_type      : Factor w/ 1 level "CD8": 1 1 1 1 1 1 1 1 1 1 ...
##  $ condition_plate: Factor w/ 2 levels "MTKO","WT": 1 1 1 1 2 2 1 1 2 2 ...
##  $ seq_id         : Factor w/ 4 levels "AH3VMFBGXY","H3332BGXY",..: 1 1 1 1 1 4 1 1 1 1 ...
##  $ batch_bio      : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mouse_id       : Factor w/ 9 levels "1","2","3","4",..: 7 7 7 7 3 3 8 7 3 3 ...
##  $ plate_uniq_num : Factor w/ 16 levels "1","2","3","4",..: 1 1 1 1 2 3 4 1 2 2 ...

QC and cell filtering

Create Seurat object

data.seurat <- CreateSeuratObject(m, meta.data = meta, project = "tumorTILS", min.cells = 3,  min.genes = 500, is.expr = 0,do.scale=F, do.center=F)
rm(m,meta)

Explore samples

table(data.seurat@meta.data$mouse_id)
## 
##   1   2   3   4   5   6   7   8   9 
## 130 158 108 138  78  80 133 106 130
summary(data.seurat@meta.data$nGene)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1532    2239    2891    3079    3787    5960
summary(data.seurat@meta.data$nUMI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8959   13733   17073   17835   21256   32145

Caclulate ribosomal content (might indicate techincal variability) (same for mitochondrial content)

ribo.genes <- grep(pattern = "^Rp[ls]", x = rownames(x = data.seurat@data), value = TRUE)
percent.ribo <- Matrix::colSums(data.seurat@raw.data[ribo.genes, ])/Matrix::colSums(data.seurat@raw.data)
summary(percent.ribo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02911 0.04271 0.05150 0.05308 0.06220 0.09850

Add new ‘metadata’ to our Seurat object

data.seurat <- AddMetaData(object = data.seurat, metadata = percent.ribo, col.name = "percent.ribo")
print(paste("matrix dim",dim(data.seurat@data)))
## [1] "matrix dim 14314" "matrix dim 1061"

In this dataset ‘high quality’ cells were already selected, however we might want to filter further based on these parameters

hist(data.seurat@meta.data$nGene)

hist(data.seurat@meta.data$nUMI)

hist(data.seurat@meta.data$percent.ribo)

Note that Seurat automatically calculates ‘nUMI’ per cell, assuming @data matrix contains UMI counts. In this case however, ‘nUMI’ contains the sum of the normalized expression values (log2 TPM) for each cell

Look at distributions per sample

data.seurat <- SetAllIdent(object = data.seurat, id = "mouse_id")
VlnPlot(object = data.seurat, features.plot = c("nGene", "nUMI","percent.ribo"), nCol = 2,point.size.use=0.001)

print(paste("matrix dim",dim(data.seurat@data)))
## [1] "matrix dim 14314" "matrix dim 1061"
data.seurat <- FilterCells(object = data.seurat, subset.names = c("nGene", "nUMI","percent.ribo"), 
    low.thresholds = c(1500, 8000, -Inf), high.thresholds = c(6000, 30000,0.1))
print(paste("matrix dim",dim(data.seurat@data)))
## [1] "matrix dim 14314" "matrix dim 1056"

There are two groups of mice, Wildtype (WT) and MT KO

table(data.seurat@meta.data$condition_plate)
## 
## MTKO   WT 
##  527  529

Will analyze only cells from WT mice

data.seurat <- SubsetData(data.seurat,cells.use = data.seurat@meta.data$condition_plate=="WT")
table(data.seurat@meta.data$mouse_id)
## 
##   1   2   3   4   5   6   7   8   9 
## 130 158 108 133   0   0   0   0   0

Dimensionality reduction

Extract highly variable genes

set.seed(1234)
 
data.seurat <- FindVariableGenes(
    object = data.seurat,
    mean.function = ExpMean, 
    dispersion.function = LogVMR, 
    x.low.cutoff = 1,
    x.high.cutoff = 15, 
    y.cutoff = 1
)

length(data.seurat@var.genes)
## [1] 1616
head(row.names(data.seurat@hvg.info)[order(data.seurat@hvg.info$gene.mean,decreasing = T)],n=20)

Check expression values of a few genes

VlnPlot(data.seurat,features.plot = qq(Gapdh,Actg1,Actb,Ptprc,Cd2,Cd3g,Cd8a,Cd8b1), point.size.use = exp(-3), x.lab.rot=T)

Some variation but no evident systematic biases

Looking at expression values

plot(density(as.numeric(data.seurat@data["Gzmb",])))

Scale data (useful for example to do PCA on scaled variables, to make heatmaps, etc)

data.seurat <- ScaleData(
    object = data.seurat, do.scale=T, do.center = T
)
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## Scaling data matrix
plot(density(as.numeric(data.seurat@scale.data["Gzmb",])))

PCA on highly variable genes

set.seed(1234)
data.seurat <- RunPCA(object = data.seurat, pc.genes = data.seurat@var.genes, do.print = TRUE, pcs.print = 1:5, 
    genes.print = 10)
## [1] "PC1"
##  [1] "Pde2a"   "Pydc3"   "Fam65b"  "Gramd3"  "Itgb7"   "Sell"    "Zfp36l2"
##  [8] "Cd7"     "Cxcr3"   "Pik3ip1"
## [1] ""
##  [1] "Cks1b"     "Tpi1"      "Birc5"     "Ube2c"     "Txn1"     
##  [6] "Rrm2"      "Ccnb2"     "Hist1h2ao" "Cst3"      "Cd74"     
## [1] ""
## [1] ""
## [1] "PC2"
##  [1] "Nkg7"   "Lag3"   "Gzmb"   "Prf1"   "Ctla4"  "Ccl3"   "Ccnb2" 
##  [8] "Hilpda" "Ifng"   "Impa2" 
## [1] ""
##  [1] "Ifi205" "Fgd2"   "Csf2rb" "Ly86"   "Rnase6" "Aif1"   "Tbc1d8"
##  [8] "Plbd1"  "Cd24a"  "Ctsh"  
## [1] ""
## [1] ""
## [1] "PC3"
##  [1] "Ccr7"  "Sell"  "Xcl1"  "Dapl1" "Lad1"  "Gpr18" "Rgcc"  "Cd69" 
##  [9] "Lta"   "Fos"  
## [1] ""
##  [1] "Gzmb"     "Id2"      "Ccl4"     "Lgals3"   "Prf1"     "Ccl3"    
##  [7] "Nkg7"     "AW112010" "Actg1"    "Hilpda"  
## [1] ""
## [1] ""
## [1] "PC4"
##  [1] "Rrm2"      "Spc24"     "Hist2h3c2" "Hist2h3b"  "Birc5"    
##  [6] "Kif22"     "Aurka"     "Fam83d"    "Pkmyt1"    "Hist1h2ao"
## [1] ""
##  [1] "Cd70"   "Cd83"   "Xcl1"   "Ccl1"   "Lad1"   "Cd81"   "Plk2"  
##  [8] "Ccr8"   "Slc2a6" "Lta"   
## [1] ""
## [1] ""
## [1] "PC5"
##  [1] "Laptm4b" "Ccl22"   "Oasl1"   "Fscn1"   "Ptafr"   "Gcnt2"   "Ebi3"   
##  [8] "Cacnb3"  "Mreg"    "Isg15"  
## [1] ""
##  [1] "Apoe"     "C1qa"     "Lyz2"     "C1qc"     "C1qb"     "Fcgr4"   
##  [7] "Plbd1"    "Cd300lf"  "Selenbp1" "Emr1"    
## [1] ""
## [1] ""

We explore the genes with highers loading to have an idea of which ones are driving global variation (e.g. important T cell genes as expected, such as Sell, Lag3, Gzmb, Prf1; cell cycle genes such as Ccnb2, Cks1b, etc )

PCElbowPlot(object = data.seurat)

###
###
Check distribution per sample (batch effects?)

data.seurat <- SetAllIdent(object = data.seurat, id = "mouse_id")
PCAPlot(object = data.seurat)

Check distribution of important genes for our subset

FeaturePlot(data.seurat,features.plot=c("Ptprc","Cd2","Cd8a","Cd8b1","Cd4"),reduction.use="pca",cols.use = c("lightgrey", 
    "blue"),nCol=2,no.legend = F,do.return=T) 

## $Ptprc

## 
## $Cd2

## 
## $Cd8a

## 
## $Cd8b1

## 
## $Cd4

Cell cycle is typically a strong source of transcriptomic variation

FeaturePlot(data.seurat,features.plot=c("Mki67","Ccna2","Cks1b"),reduction.use="pca",cols.use = c("lightgrey", 
    "blue"),nCol=2,no.legend = F,do.return=T)

## $Mki67

## 
## $Ccna2

## 
## $Cks1b

Number of detected genes/reads count per cell associated to amount af mRNA: might be associated to biology (eg cell size, transcriptional activity), cell quality, technical effects, etc

FeaturePlot(data.seurat,features.plot=c("nUMI","nGene","percent.ribo"),reduction.use="pca",cols.use = c("lightgrey", 
    "blue"),nCol=2,no.legend = F,do.return=T) 

## $nUMI

## 
## $nGene

## 
## $percent.ribo

There are some outliers. Not interested (unless looking for rare cell states)

Spot genes among top genes contributing to PCs which might be associated to outliers

FeaturePlot(data.seurat,features.plot=c("H2-Eb1","Ctsh","H2-Aa","H2-Ab1","Csf2rb"),reduction.use="pca",cols.use = c("lightgrey", 
    "blue"),nCol=2,no.legend = F,do.return=T) 

## $`H2-Eb1`

## 
## $Ctsh

## 
## $`H2-Aa`

## 
## $`H2-Ab1`

## 
## $Csf2rb

Indeed, these outliers express high levels of MHC II genes, Csf2rb, etc. suggesting these might be contaminating myeloid/APCs

Filter out outliers based on expression (or lack of) of a few genes (you might try multivariate filtering criteria as well)

data.seurat <- FilterCells(object = data.seurat, subset.names = c("Cd2", "Cd8a","Cd8b1","Cd4","Ctsh"), 
    low.thresholds = c(1, 1, 1,-Inf,-Inf), high.thresholds = c(Inf,Inf,Inf,exp(-10),exp(-10)))
print(paste("matrix dim",dim(data.seurat@data)))
## [1] "matrix dim 14314" "matrix dim 398"
FeaturePlot(data.seurat,features.plot=c("Ptprc","Cd2","Cd8a","Cd8b1","H2-Aa","Csf2rb"),reduction.use="pca",cols.use = c("lightgrey", 
    "blue"),nCol=2,no.legend = F,do.return=T) 

## $Ptprc

## 
## $Cd2

## 
## $Cd8a

## 
## $Cd8b1

## 
## $`H2-Aa`

## 
## $Csf2rb

Outliers were removed

Also there is a strong effect from cell cycle genes, we might want to remove them from our list of highly variable genes used for dimensionality

Get list of cell cycle genes (map to Ensembl IDs)

  #BiocManager::install("org.Mm.eg.db",type="source")
  #BiocManager::install("clusterProfiler")
  #BiocManager::install("DO.db",type="source")
  #BiocManager::install("GO.db",type="source")
  #signatureList <- readRDS("input/signatureList.rds")
  library(org.Mm.eg.db)
  library(clusterProfiler)
  
  idMap <- bitr( rownames(data.seurat@data) , fromType = "SYMBOL", toType = c("ENTREZID"), OrgDb = org.Mm.eg.db)
## Warning in bitr(rownames(data.seurat@data), fromType = "SYMBOL", toType =
## c("ENTREZID"), : 5.71% of input gene IDs are fail to map...
  cellCycle <- mget(c("GO:0007049"),org.Mm.egGO2ALLEGS)
  cellCycle <- unique(unlist(cellCycle))
  cellCycle.symbol <- unique(na.omit(idMap$SYMBOL[match(cellCycle,idMap$ENTREZID)]))
  length(cellCycle.symbol)
## [1] 1293
  print(length(data.seurat@var.genes))
## [1] 1616
  data.seurat@var.genes <- setdiff(data.seurat@var.genes,cellCycle.symbol)
  print(length(data.seurat@var.genes))
## [1] 1494

Dimensionality reduction and clustering

PCA

data.seurat <- ScaleData(object = data.seurat, do.scale=T, do.center = T) #since we filtered out cells, scaling might have slightly changed
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## Scaling data matrix
set.seed(1234)
data.seurat <- RunPCA(object = data.seurat, pc.genes = data.seurat@var.genes, do.print = TRUE, pcs.print = 1:5, 
    genes.print = 10)
## [1] "PC1"
##  [1] "Pydc3"   "Pde2a"   "Itgb7"   "Fam65b"  "Sell"    "Cxcr3"   "Gramd3" 
##  [8] "Cd7"     "Pik3ip1" "Igfbp4" 
## [1] ""
##  [1] "Tpi1"      "Lag3"      "Impa2"     "Txn1"      "Gapdh"    
##  [6] "Bcl2a1d"   "Rrm2"      "Hist1h2ao" "Gzmc"      "Mt1"      
## [1] ""
## [1] ""
## [1] "PC2"
##  [1] "Ccr7"   "Sell"   "Xcl1"   "Lad1"   "Dapl1"  "Cd83"   "Slc2a6"
##  [8] "Fos"    "Cd69"   "Cd81"  
## [1] ""
##  [1] "Gzmb"      "Ccl4"      "Nkg7"      "Prf1"      "Lgals3"   
##  [6] "Actg1"     "Ccl3"      "AW112010"  "AA467197"  "Serpina3g"
## [1] ""
## [1] ""
## [1] "PC3"
##  [1] "Ly6c2"     "Rrm2"      "Hist2h3c2" "Hist2h3b"  "Sell"     
##  [6] "Nrm"       "Tgtp1"     "Pde2a"     "Mef2d"     "Hist1h2ao"
## [1] ""
##  [1] "Cd83"    "Cd81"    "Cd70"    "Ccr8"    "Ccl1"    "Xcl1"    "Lag3"   
##  [8] "Bcl2a1d" "Slc2a6"  "Idi2"   
## [1] ""
## [1] ""
## [1] "PC4"
##  [1] "Rrp9"          "Utf1"          "Apoe"          "2410002F23Rik"
##  [5] "Atf3"          "Prdx4"         "H2-Eb1"        "Rrm2"         
##  [9] "Rps4y2"        "Hsph1"        
## [1] ""
##  [1] "Lars2"   "Gm16894" "Zc3h12a" "Lgals9"  "Mcoln1"  "Unc93b1" "Foxo3"  
##  [8] "Kdelc2"  "Irf7"    "Igflr1" 
## [1] ""
## [1] ""
## [1] "PC5"
##  [1] "Gzmd"      "Gzmg"      "Gzme"      "S1pr5"     "Gzmf"     
##  [6] "Klrg1"     "Gzma"      "Gzmc"      "Dtx1"      "Serpinb9b"
## [1] ""
##  [1] "Hist2h3c2" "Hist2h3b"  "Ctla4"     "Ifi27l2a"  "Rtp4"     
##  [6] "Rrm2"      "Hist1h2ao" "AW112010"  "Ifit3"     "Cxcl10"   
## [1] ""
## [1] ""
PCElbowPlot(object = data.seurat)

ndim=10

tSNE

data.seurat <- RunTSNE(object = data.seurat, dims.use = 1:ndim, do.fast = F, seed.use=123, perplexity=30)
#pdf("out/tsne_Markers.pdf",width=10,height=10)
myMarkers <- qq(Sell,Il7r,Ccr7,Tcf7,Gzmb,Fasl,Prf1,Pdcd1,Lag3,Tigit,Havcr2,Entpd1,Mki67,Ccna2)
FeaturePlot(data.seurat,features.plot=myMarkers,reduction.use="tsne",cols.use = c("lightgrey", "blue"),nCol=3,no.legend = F,do.return=T) 

## $Sell

## 
## $Il7r

## 
## $Ccr7

## 
## $Tcf7

## 
## $Gzmb

## 
## $Fasl

## 
## $Prf1

## 
## $Pdcd1

## 
## $Lag3

## 
## $Tigit

## 
## $Havcr2

## 
## $Entpd1

## 
## $Mki67

## 
## $Ccna2

#dev.off()

Visually explore heterogenity of relevant genes

Clustering algorithm implemented in Seruat: shared nearest neighbor (SNN) modularity optimization based clustering algorithm. Important parameters: resolution, dims.use, k.param

set.seed(12345)
data.seurat <- FindClusters(object = data.seurat, reduction.type = "pca", dims.use = 1:ndim, 
    resolution = 0.5, print.output = 0, save.SNN = TRUE, force.recalc = T, k.param = 10)

data.seurat@meta.data$cluster <- factor(data.seurat@meta.data$res.0.5)

Define cluster colors

clusterColors <- gg_color_hue(length(levels(data.seurat@meta.data$cluster)))
names(clusterColors) <- levels(data.seurat@meta.data$cluster)
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cluster", colors.use = clusterColors)

Hierarchical clustering (might want to try dynamicTreeCut as well)

set.seed(1234)
fit.hclust=hclust(dist(data.seurat@dr$pca@cell.embeddings[,1:ndim]),method="ward.D2")

nclus <- 6
clusters=factor(cutree(fit.hclust,k=nclus))
data.seurat@ident <- clusters
data.seurat <- StashIdent(object = data.seurat, save.name = "wardClustering")
data.seurat <- SetAllIdent(object = data.seurat, id = "wardClustering")
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "wardClustering")

plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cluster")
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, group.by = "wardClustering", 
    no.legend = TRUE, do.label = TRUE)

plot_grid(plot1, plot2)

Good correspondence

K-Means

set.seed(1234)
nclus=6
clusters.kmeans <- kmeans(data.seurat@dr$pca@cell.embeddings[,1:ndim],nclus,nstart = 10)
data.seurat@ident <- factor(clusters.kmeans$cluster)
data.seurat <- StashIdent(object = data.seurat, save.name = "kmeansClustering")
TSNEPlot(object = data.seurat, do.return = TRUE, group.by = "kmeansClustering", 
    no.legend = TRUE, do.label = TRUE)

#ggsave("out/tSNE.0.pdf",width = 3,height = 3)

Also with KMeans

table(data.seurat@meta.data$wardClustering,data.seurat@meta.data$cluster)

# Rand index for cluster agreement

#install.packages("mclust")
library(mclust)
  adjustedRandIndex(data.seurat@meta.data$wardClustering, data.seurat@meta.data$cluster)
  adjustedRandIndex(data.seurat@meta.data$kmeans, data.seurat@meta.data$cluster)
  adjustedRandIndex(data.seurat@meta.data$kmeans, data.seurat@meta.data$wardClustering)
  
  #SNN clustering has more agreement with the other two

Checking co-expression of important genes

p1 <- plot2geneSeuratTSNE(data.seurat,"Pdcd1","Tcf7")

p2 <- plot2geneSeuratTSNE(data.seurat,"Gzmb","Ccr7")

p3 <- plot2geneSeuratTSNE(data.seurat,"Lag3","Sell")

p4 <- plot2geneSeuratTSNE(data.seurat,"Havcr2","Entpd1")

plot_grid(p1,p2,p3,p4)

#ggsave("out/tSNE.markers.pdf",width = 10,height=8)
plot2geneSeuratTSNE(data.seurat,"Pdcd1","Tcf7")

plot2geneSeuratTSNE(data.seurat,"Mki67","Ccna2")

###
###

Sample distribution (biological variablity +batch effect)

for (s in levels(factor(data.seurat@meta.data$mouse_id))) {
  mycol <- data.seurat@meta.data$mouse_id==s
  
  print(ggplot(data=data.frame(data.seurat@dr$tsne@cell.embeddings), aes(x=tSNE_1,y=tSNE_2, color="gray")) + 
          geom_point(color="gray",alpha=0.5) + 
          geom_point(data=data.frame(data.seurat@dr$tsne@cell.embeddings[mycol,]),alpha=0.7, color="blue")  + 
          theme_bw() + theme(aspect.ratio = 1, legend.position="right") + xlab("TSNE 1") + ylab("TSNE 2") + ggtitle(s))
}

Supervised cell state identification

classification of CD8 TIL states using TILPRED

#install.packages("BiocManager")
#install.packages("doParallel")
#install.packages("doRNG")
#BiocManager::install("GenomeInfoDbData",type="source")
#BiocManager::install("AUCell")
#BiocManager::install("SingleCellExperiment",type="source")
#install.packages("remotes")
#remotes::install_github("carmonalab/TILPRED")
library(SingleCellExperiment)
library(AUCell)
library(TILPRED)
data.sce <- Convert(data.seurat,to="sce")
data.sce.pred <- predictTilState(data.sce)
## Warning in predictTilState(data.sce): The following genes were not found
## Gm10282,Cks1brt,Pcna-ps2 . Unknown prediction performance.
## Genes in the gene sets NOT available in the dataset: 
##  NaiveVsExhausted_down:  1 (1% of 97)
##  cycling:    2 (2% of 94)
table(data.sce.pred$predictedState)
## 
##      Naive   Effector MemoryLike  Exhausted    unknown 
##         46         90         68        184         10
str(data.sce.pred)
## List of 3
##  $ predictedState        : Factor w/ 5 levels "Naive","Effector",..: 5 4 4 4 4 3 2 4 4 4 ...
##   ..- attr(*, "names")= chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
##  $ stateProbabilityMatrix: num [1:398, 1:4] 1.61e-02 2.57e-03 1.00e-06 1.38e-05 5.46e-05 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
##   .. ..$ : chr [1:4] "Naive" "Effector" "MemoryLike" "Exhausted"
##  $ cycling               : Named logi [1:398] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   ..- attr(*, "names")= chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
head(data.sce.pred$stateProbabilityMatrix)
##                           Naive     Effector  MemoryLike Exhausted
## AH3VMFBGXY_A10_S10 1.608488e-02 2.974940e-01 0.359993363 0.3264278
## AH3VMFBGXY_A4_S4   2.567036e-03 4.726579e-01 0.019469265 0.5053058
## AH3VMFBGXY_A6_S6   1.003015e-06 1.676011e-06 0.001023729 0.9989736
## AH3VMFBGXY_A7_S7   1.378348e-05 1.564267e-04 0.004174986 0.9956548
## AH3VMFBGXY_A9_S9   5.460452e-05 9.523511e-04 0.009209985 0.9897831
## AH3VMFBGXY_B11_S23 2.596244e-04 8.337796e-04 0.817061148 0.1818454
data.seurat <- AddMetaData(object = data.seurat, metadata = data.sce.pred$predictedState, col.name = "state.pred")
data.seurat <- AddMetaData(object = data.seurat, metadata = data.sce.pred$cycling, col.name = "cycling")
stateColorsPred <- c("#F8766D","#A58AFF","#00B6EB","#53B400","#000000")
names(stateColorsPred) <- qq(Naive,Effector,MemoryLike,Exhausted,NP) 
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)

TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cycling")

plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cycling")
plot_grid(plot1, plot2)

plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cluster")
plot_grid(plot1, plot2)

Explore association of clusters and predicted states

library(pheatmap)
library(RColorBrewer)
a <- table(data.seurat@meta.data$state.pred,data.seurat@meta.data$cluster)
a.scaled <- scale(a,scale = colSums(a),center=F)*100
a.scaled
##             
##                       0          1          2          3          4
##   Naive        0.000000   3.191489   0.000000  82.692308   0.000000
##   Effector     1.709402  89.361702   0.000000   5.769231   2.083333
##   MemoryLike   6.837607   1.063830  21.739130   3.846154  87.500000
##   Exhausted   88.034188   6.382979  75.362319   0.000000  10.416667
##   unknown      3.418803   0.000000   2.898551   7.692308   0.000000
##             
##                       5
##   Naive        0.000000
##   Effector     0.000000
##   MemoryLike   0.000000
##   Exhausted  100.000000
##   unknown      0.000000
pheatmap(round(a.scaled),cexCol = 0.8,cluster_cols = F)

Cycling cells per cluster

barplot(tapply(data.seurat@meta.data$cycling,data.seurat@meta.data$cluster,mean))

Differential gene expression and gene signature enrichment analysis

#BiocManager::install("MAST")
data.seurat <- SetAllIdent(object = data.seurat, id = "cluster")
cd8.markers <- FindAllMarkers(object = data.seurat, only.pos = TRUE, min.pct = 0.1, min.diff.pct=0.1, logfc.threshold = 0.25, test.use = "bimod",max.cells.per.ident = Inf)#Try MAST
cd8.markers.list <- split(cd8.markers,cd8.markers$cluster)
cd8.markers.list <- lapply((cd8.markers.list),function(x) x$gene)
lapply(cd8.markers.list,head)
## $`0`
## [1] "Tigit"  "Havcr2" "Lag3"   "Id2"    "Cxcr6"  "Prf1"  
## 
## $`1`
## [1] "Ccl5"    "Alox8"   "Gvin1"   "Faim"    "Mettl23" "Ccdc124"
## 
## $`2`
## [1] "2810417H13Rik" "Gmnn"          "Mcm5"          "Hmmr"         
## [5] "Cdc45"         "Brip1"        
## 
## $`3`
## [1] "Sell"    "Cnr2"    "Rapgef4" "Tcf7"    "Satb1"   "Lef1"   
## 
## $`4`
## [1] "Ajuba"    "Zfp202"   "Cd83"     "Ifi27l2a" "Ccr7"     "Xcl1"    
## 
## $`5`
## [1] "Lix1l"  "Gzmf"   "Gzmg"   "Gzmd"   "Agphd1" "Gzmc"
set.seed(1234)

geneSample <- unique(unlist(lapply(cd8.markers.list,function(x) { head(x,n=10) })))

clusterList <- split(row.names(data.seurat@meta.data),data.seurat@meta.data$cluster)

cellSample <- unlist(lapply(clusterList,function(x) { head(x,n=25) }))

ann_col <- data.frame(cluster = data.seurat@meta.data$cluster)
row.names(ann_col) <- row.names(data.seurat@meta.data)

ann_colors <- list(cluster = clusterColors)

pheatmap(data.seurat@data[geneSample,cellSample], cluster_rows = T, cluster_cols = F, labels_col = "",annotation_col = ann_col[cellSample,,drop=F],scale = "row", clustering_distance_rows="correlation", annotation_colors = ann_colors)

myMarkers <- qq(Sell,Il7r,Lef1,S1pr1,Ccr7,Tcf7,Cd44,Ccl5,Gzmk,Cxcr3,Gzmb,Fasl,Prf1,Tox,Batf,Pdcd1,Lag3,Tigit,Havcr2,Entpd1,Mki67,Ccna2)
data.seurat <- SetAllIdent(object = data.seurat, id = "cluster")
DotPlot(data.seurat,genes.plot = myMarkers,x.lab.rot = 1,plot.legend =T,cols.use = "RdYlGn")

#ggsave("out/DotPlot.pdf",width = 10, height = 3)

Define a few example gene signatures

signatureList <- list()
signatureList$Cytotoxicity <- c("Prf1","Fasl","Gzmb")
signatureList$Stemness <- c("Tcf7","Sell","Il7r","Lef1")
signatureList$InhibitoryReceptors <- c("Pdcd1","Havcr2","Tigit","Lag3","Ctla4")
signatureList$cellCycleGenes <- cellCycle.symbol

Use AUCell package to calculate signature enrichment

#BiocManager::install("AUCell")
library(AUCell)
set.seed(1234)
cells_rankings <- AUCell::AUCell_buildRankings(as.matrix(data.seurat@data), nCores=2, plotStats=TRUE)
## Quantiles for the number of genes detected by cell: 
## (Non-detected genes are shuffled at the end of the ranking. Keep it in mind when choosing the threshold for calculating the AUC).
##     min      1%      5%     10%     50%    100% 
## 1576.00 1595.94 1719.00 1885.80 2901.50 5786.00
## Using 2 cores.

cells_AUC <- AUCell_calcAUC(signatureList, cells_rankings, aucMaxRank=1500)
print(cells_AUC)
## AUC for 4 gene sets (rows) and 398 cells (columns).
## 
## Top-left corner of the AUC matrix:
##                      cells
## gene sets             AH3VMFBGXY_A10_S10 AH3VMFBGXY_A4_S4 AH3VMFBGXY_A6_S6
##   Cytotoxicity                0.44570538        0.9178905        0.7725857
##   Stemness                    0.14173623        0.0000000        0.0000000
##   InhibitoryReceptors         0.63233133        0.7472278        0.7910488
##   cellCycleGenes              0.09842882        0.1018561        0.1251876
##                      cells
## gene sets             AH3VMFBGXY_A7_S7 AH3VMFBGXY_A9_S9
##   Cytotoxicity               0.7405429        0.6030263
##   Stemness                   0.0000000        0.0000000
##   InhibitoryReceptors        0.6120240        0.6452906
##   cellCycleGenes             0.1054429        0.1094186
data.seurat@meta.data <- data.seurat@meta.data[!grepl("^AUC",colnames(data.seurat@meta.data))]
data.seurat <- AddMetaData(data.seurat, metadata = t(assays(cells_AUC)$AUC), col.name = paste0("AUC_",rownames(cells_AUC)))
for (s in names(signatureList)) {

  print(myFeaturePlotAUC(data.seurat,s,cells_AUC))

}

for (s in names(signatureList)) {
  print(VlnPlot(object = data.seurat, features.plot = paste0("AUC_",s), point.size.use=0.001, cols.use=clusterColors, group.by = "cluster"))
}

#BiocManager::install("iSEE")
library(iSEE)
app <- iSEE(data.sce)
shiny::runApp(app)
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] doRNG_1.7.1                 rngtools_1.3.1             
##  [3] pkgmaker_0.27               registry_0.5               
##  [5] foreach_1.4.4               RColorBrewer_1.1-2         
##  [7] pheatmap_1.0.12             TILPRED_0.1.0              
##  [9] AUCell_1.4.1                SingleCellExperiment_1.4.1 
## [11] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
## [13] BiocParallel_1.16.5         matrixStats_0.54.0         
## [15] GenomicRanges_1.34.0        GenomeInfoDb_1.18.1        
## [17] clusterProfiler_3.10.1      org.Mm.eg.db_3.7.0         
## [19] AnnotationDbi_1.44.0        IRanges_2.16.0             
## [21] S4Vectors_0.20.1            Biobase_2.42.0             
## [23] BiocGenerics_0.28.0         tidyr_1.0.0                
## [25] dplyr_0.8.3                 Seurat_2.3.4               
## [27] Matrix_1.2-17               cowplot_0.9.4              
## [29] ggplot2_3.1.0              
## 
## loaded via a namespace (and not attached):
##   [1] reticulate_1.10        R.utils_2.7.0          tidyselect_0.2.5      
##   [4] RSQLite_2.1.1          htmlwidgets_1.3        grid_3.5.2            
##   [7] trimcluster_0.1-2.1    Rtsne_0.15             munsell_0.5.0         
##  [10] codetools_0.2-16       ica_1.0-2              units_0.6-2           
##  [13] withr_2.1.2            colorspace_1.4-0       GOSemSim_2.8.0        
##  [16] knitr_1.20             rstudioapi_0.9.0       ROCR_1.0-7            
##  [19] robustbase_0.93-5      dtw_1.20-1             DOSE_3.8.2            
##  [22] gbRd_0.4-11            Rdpack_0.10-1          labeling_0.3          
##  [25] lars_1.2               urltools_1.7.1         GenomeInfoDbData_1.2.0
##  [28] bit64_0.9-7            farver_1.1.0           vctrs_0.2.0           
##  [31] doParallel_1.0.14      diptest_0.75-7         R6_2.3.0              
##  [34] hdf5r_1.2.0            flexmix_2.3-14         bitops_1.0-6          
##  [37] fgsea_1.8.0            gridGraphics_0.3-0     assertthat_0.2.0      
##  [40] promises_1.0.1         SDMTools_1.1-221       scales_1.0.0          
##  [43] ggraph_1.0.2           nnet_7.3-12            enrichplot_1.2.0      
##  [46] gtable_0.2.0           npsurv_0.4-0           rlang_0.4.0           
##  [49] zeallot_0.1.0          splines_3.5.2          lazyeval_0.2.1        
##  [52] acepack_1.4.1          europepmc_0.3          checkmate_1.9.1       
##  [55] yaml_2.2.0             reshape2_1.4.3         backports_1.1.3       
##  [58] httpuv_1.4.5.1         qvalue_2.14.1          Hmisc_4.2-0           
##  [61] tools_3.5.2            ggplotify_0.0.3        ellipsis_0.2.0.1      
##  [64] gplots_3.0.1           proxy_0.4-22           ggridges_0.5.1        
##  [67] Rcpp_1.0.2             plyr_1.8.4             base64enc_0.1-3       
##  [70] progress_1.2.0         zlibbioc_1.28.0        purrr_0.3.0           
##  [73] RCurl_1.95-4.11        prettyunits_1.0.2      rpart_4.1-13          
##  [76] pbapply_1.3-4          viridis_0.5.1          zoo_1.8-4             
##  [79] ggrepel_0.8.0          cluster_2.0.7-1        magrittr_1.5          
##  [82] data.table_1.12.2      DO.db_2.9              lmtest_0.9-36         
##  [85] triebeard_0.3.0        RANN_2.6.1             mvtnorm_1.0-11        
##  [88] packrat_0.5.0          fitdistrplus_1.0-14    mime_0.6              
##  [91] xtable_1.8-3           hms_0.4.2              lsei_1.2-0            
##  [94] evaluate_0.12          XML_3.98-1.16          mclust_5.4.5          
##  [97] gridExtra_2.3          compiler_3.5.2         tibble_2.1.3          
## [100] KernSmooth_2.23-15     crayon_1.3.4           R.oo_1.22.0           
## [103] htmltools_0.3.6        later_0.7.5            segmented_0.5-3.0     
## [106] Formula_1.2-3          snow_0.4-3             DBI_1.0.0             
## [109] tweenr_1.0.1           MASS_7.3-51.1          fpc_2.1-11.1          
## [112] R.methodsS3_1.7.1      gdata_2.18.0           metap_1.0             
## [115] igraph_1.2.2           pkgconfig_2.0.2        rvcheck_0.1.3         
## [118] foreign_0.8-71         xml2_1.2.2             annotate_1.60.0       
## [121] XVector_0.22.0         bibtex_0.4.2           stringr_1.3.1         
## [124] digest_0.6.18          tsne_0.1-3             graph_1.60.0          
## [127] rmarkdown_1.11         fastmatch_1.1-0        htmlTable_1.13.1      
## [130] GSEABase_1.44.0        kernlab_0.9-27         shiny_1.2.0           
## [133] gtools_3.8.1           modeltools_0.2-22      lifecycle_0.1.0       
## [136] nlme_3.1-137           jsonlite_1.6           viridisLite_0.3.0     
## [139] pillar_1.3.1           lattice_0.20-38        httr_1.4.0            
## [142] DEoptimR_1.0-8         survival_2.43-3        GO.db_3.7.0           
## [145] glue_1.3.0             UpSetR_1.3.3           png_0.1-7             
## [148] prabclus_2.2-7         iterators_1.0.10       bit_1.1-14            
## [151] ggforce_0.1.3          class_7.3-15           stringi_1.2.4         
## [154] mixtools_1.1.0         blob_1.1.1             doSNOW_1.0.16         
## [157] latticeExtra_0.6-28    caTools_1.17.1.1       memoise_1.1.0         
## [160] irlba_2.3.3            ape_5.2